library(knitr)
library(data.table)
library(plyr)
library(dplyr)
library(Rtsne)
library(pca3d)
library(scatterplot3d)
library(rgl)
library(kernlab)
library(caret)
For this assignment I will use data from the Kaggle competition “Give me some credit”.
This competition requires participants to improve on the state of the art in credit scoring, by predicting the probability that somebody will experience financial distress in the next two years.
I prepared this data in the “Preparación de Datos en R” (data preparation in R) assignment; see the other HTML file.
# Set the working directory for the whole analysis (later snapshot/save calls
# rely on it). NOTE(review): a hard-coded, user-specific setwd() makes this
# script non-portable; prefer project-relative paths (e.g. here::here()).
setwd("/Users/Jurjen/OneDrive - U-tad/ReduccionDimensiones/Practica")
# Load the prepared training data (see the data-preparation assignment).
mydata <- read.csv("Train.csv")
# Preview the first six rows as a formatted table.
mydata %>% head() %>% kable(row.names = F)
| SD | Age | RUUL | DebtRatio | OpenCredit | REloans | Dep | no3059 | LateDays | logIncome | Debt | Slack |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | -0.4939023 | 1.2448585 | 1.8402257 | 0.8836540 | 4.4647142 | 1.1416911 | 2.5165915 | 0.6509246 | 0.6966108 | 3.3016944 | -0.8600639 |
| 0 | -0.8323977 | 1.7805980 | -0.8373704 | -0.8652939 | -0.9119619 | 0.2375055 | -0.3518966 | -0.3103085 | -0.8503891 | -1.0156682 | -0.6902144 |
| 0 | -0.9677959 | 0.9421161 | -0.9818942 | -1.2539490 | -0.9119619 | -0.6666802 | 1.0823475 | 1.6121577 | -0.6568490 | -1.0513858 | -0.5155996 |
| 0 | -1.5093885 | -0.2480561 | -1.1747757 | -0.6709664 | -0.9119619 | -0.6666802 | -0.3518966 | -0.3103085 | -0.5564974 | -1.1376249 | -0.3766131 |
| 0 | -0.2231059 | 1.6406179 | -1.2185069 | -0.2823113 | -0.0158492 | -0.6666802 | 1.0823475 | 0.1703081 | 1.3519931 | -0.9725529 | 3.7977272 |
| 0 | 1.4693711 | -0.3059173 | 0.1601081 | -1.0596214 | -0.0158492 | 0.2375055 | -0.3518966 | -0.3103085 | -0.4839642 | -0.4008472 | -0.7243528 |
# The working directory was already set above; repeating setwd() is redundant.
# Read the data dictionary via an explicit path instead (semicolon-separated).
DD <- read.csv(
  "/Users/Jurjen/OneDrive - U-tad/ReduccionDimensiones/Practica/DataDictionary.csv",
  sep = ';'
)
# Render the variable descriptions as a table.
DD %>% kable(row.names = FALSE)
| Variable.Name | Description | Type |
|---|---|---|
| SD | SeriousDlqin2yrs: person experienced 90 days past due delinquency or worse | Y/N |
| Age | Age of borrower in years | integer |
| RUUL | Revolving Utilization of Unsecured Lines: Total balance on credit cards and personal lines of credit except real estate and no installment debt like car loans divided by the sum of credit limits | percentage |
| DebtRatio | Monthly debt payments, alimony, living costs divided by monthly gross income | percentage |
| OpenCredit | Number of Open loans (installment like car loan or mortgage) and Lines of credit (e.g. credit cards) | integer |
| REloans | Number of mortgage and real estate loans including home equity lines of credit | integer |
| Dep | Number of dependents in family excluding themselves (spouse, children etc.) | integer |
| no3059 | Number of times borrower has been 30-59 days past due but no worse in the last 2 years. | integer |
| LateDays | The sum of the number of days a person was late with payment. | integer |
| logIncome | Logarithm of monthly income | real |
| Debt | Monthly debt payments, alimony, living costs | real |
| Slack | What a person has left when he subtracts his debt from his income. | real |
# Column means: colMeans() is clearer and faster than apply(, 2, mean), which
# coerces the data frame to a matrix first. Near-zero values confirm the
# predictors were centered during data preparation.
colMeans(mydata)
## SD Age RUUL DebtRatio OpenCredit
## 6.684000e-02 -2.155537e-17 -2.037136e-16 -1.772317e-16 4.476466e-17
## REloans Dep no3059 LateDays logIncome
## -1.825765e-16 -6.843217e-16 -7.003043e-16 1.935481e-16 8.727115e-17
## Debt Slack
## -3.314229e-17 -3.302383e-17
# Per-column variances; sapply() iterates over columns directly instead of
# coercing the data frame to a matrix as apply() does. Values of 1 confirm the
# predictors were scaled (SD is the binary response, so it was not).
sapply(mydata, var)
## SD Age RUUL DebtRatio OpenCredit REloans
## 0.06237283 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000
## Dep no3059 LateDays logIncome Debt Slack
## 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000
This data has already been centered and scaled for the “Preparación de Datos” (data preparation) assignment.
Because the data is highly unbalanced, I will take a balanced sample with 20k datapoints.
# Split the observations by the response so the majority class (SD = 0) can be
# undersampled down to the size of the minority class (SD = 1).
mydata_split <- split(mydata, as.factor(mydata$SD))
SD0 <- mydata_split[["0"]]  # [[ ]] is safer than $'0' (no partial matching)
SD1 <- mydata_split[["1"]]
n <- nrow(SD1)              # minority-class size
# Draw n rows from the majority class; keep sample_n() and this seed so the
# draw is identical to the one documented in the output below.
set.seed(42)
SD0_sample <- sample_n(SD0, n)  # renamed: the size is n, not a fixed 10k
# Combine both classes and shuffle the rows so they are interleaved.
mydata_balanced <- rbind(SD0_sample, SD1)
mydata <- mydata_balanced[sample(2 * n), ]
summary(mydata)
## SD Age RUUL DebtRatio
## Min. :0.0 Min. :-2.1187 Min. :-0.9038 Min. :-1.3165
## 1st Qu.:0.0 1st Qu.:-0.9001 1st Qu.:-0.6989 1st Qu.:-0.6067
## Median :0.5 Median :-0.2231 Median : 0.3468 Median : 0.0000
## Mean :0.5 Mean :-0.2034 Mean : 0.4975 Mean : 0.1091
## 3rd Qu.:1.0 3rd Qu.: 0.4539 3rd Qu.: 1.7440 3rd Qu.: 0.4619
## Max. :1.0 Max. : 3.2972 Max. : 2.8884 Max. : 6.1656
## OpenCredit REloans Dep
## Min. :-1.64260 Min. :-0.91196 Min. :-0.66668
## 1st Qu.:-0.86529 1st Qu.:-0.91196 1st Qu.:-0.66668
## Median :-0.28231 Median :-0.01585 Median :-0.66668
## Mean :-0.05037 Mean :-0.01527 Mean : 0.08252
## 3rd Qu.: 0.49500 3rd Qu.: 0.88026 3rd Qu.: 0.23750
## Max. : 9.43407 Max. :17.01029 Max. : 8.37518
## no3059 LateDays logIncome
## Min. :-0.3519 Min. :-0.3103 Min. :-10.54352
## 1st Qu.:-0.3519 1st Qu.:-0.3103 1st Qu.: -0.46865
## Median :-0.3519 Median :-0.3103 Median : 0.18046
## Mean : 0.4717 Mean : 0.6845 Mean : -0.07508
## 3rd Qu.: 1.0823 3rd Qu.: 1.1315 3rd Qu.: 0.33475
## Max. :18.2933 Max. :24.2011 Max. : 1.35199
## Debt Slack
## Min. :-1.21093 Min. :-4.5786
## 1st Qu.:-0.76608 1st Qu.:-0.7813
## Median : 0.04979 Median :-0.1115
## Mean : 0.03707 Mean :-0.1466
## 3rd Qu.: 0.35207 3rd Qu.: 0.1544
## Max. :12.61682 Max. : 3.9328
Because of sampling the data isn’t scaled anymore.
set.seed(42)
# PCA on the predictors only (column 1 is the response SD). scale. = FALSE on
# purpose: the balanced subsample is no longer exactly standardized, which this
# section demonstrates. Spell out FALSE rather than the reassignable F.
pca_mydata <- prcomp(mydata[,-1], scale. = FALSE)
# Map the binary response to plot colors: green = first level (0), red = second
# level (1). The stray "+" console-continuation prompt pasted into the original
# definition has been removed.
Cols <- function(vec) {
  cols <- c("green", "red")
  cols[as.numeric(as.factor(vec))]
}
labels <- mydata$SD
par(mfrow = c(1, 1))
# Scatter of the first two principal component scores, colored by class.
plot(pca_mydata$x[, 1:2], col = Cols(labels))
title(main = "PCA without scaling the datasample, 20k datapoints")
The red points are the people with a serious delinquency. The variance of these points (SD=1) is a lot bigger than that of the green points (SD=0).
# Biplot: overlays variable loadings on the first two PCs; observations are
# drawn as "." so 20k points stay legible.
biplot(pca_mydata, xlabs=rep(".", nrow(mydata)))
title(main = "biplot of PCA without scaling the datasample, 20k datapoints")
LateDays, no3059 and RUUL are important variables in PC1.
Debt, REloans, DebtRatio and OpenCredit are important variables in PC2. We can see this in the rotation matrix as well:
# Full loading (rotation) matrix of the unscaled PCA, rendered as a table.
pca_mydata$rotation %>% kable(row.names = T)
| PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | PC11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Age | 0.0789108 | -0.0962894 | -0.2250895 | -0.3616954 | 0.3120379 | -0.0174741 | 0.7979091 | -0.0773024 | 0.2481587 | 0.0077937 | -0.0325819 |
| RUUL | -0.2602313 | 0.1187996 | 0.2649503 | 0.3731773 | -0.2761405 | 0.5577852 | 0.3144218 | -0.4430075 | 0.1557705 | -0.0303642 | 0.0320215 |
| DebtRatio | -0.0790184 | -0.4479517 | 0.4375161 | -0.0111380 | 0.0844829 | 0.0035208 | 0.1810887 | 0.2115911 | -0.3028204 | -0.2141888 | 0.6158410 |
| OpenCredit | 0.0308925 | -0.3994792 | -0.1328818 | -0.1748884 | 0.0409093 | -0.1594973 | -0.1762682 | -0.8045933 | -0.2891627 | -0.0365625 | -0.0266978 |
| REloans | 0.0149449 | -0.5031886 | -0.0511274 | 0.1158610 | 0.0694128 | 0.0412832 | -0.2856727 | 0.0193859 | 0.7762022 | -0.1904019 | 0.0573602 |
| Dep | -0.0535120 | -0.0767410 | -0.0462910 | 0.4751677 | -0.3901127 | -0.7119685 | 0.3132208 | -0.0261826 | 0.0602562 | -0.0318648 | -0.0281789 |
| no3059 | -0.5441232 | -0.1458542 | -0.1779235 | -0.5226752 | -0.5941675 | 0.0360914 | -0.0125418 | 0.1494060 | 0.0284371 | 0.0114374 | -0.0073384 |
| LateDays | -0.7822205 | 0.0970139 | -0.1147108 | 0.1873515 | 0.5487051 | -0.1528488 | -0.0713505 | -0.0187313 | -0.0219470 | 0.0098104 | 0.0007204 |
| logIncome | 0.0319078 | -0.1742720 | -0.5051379 | 0.2601027 | -0.0257921 | 0.2749603 | 0.0793516 | 0.1882096 | -0.3101064 | -0.6212698 | -0.2136899 |
| Debt | -0.0286952 | -0.5396560 | -0.0006506 | 0.2247357 | 0.0324220 | 0.1945535 | 0.0887371 | 0.2130386 | -0.1877077 | 0.6264435 | -0.3709634 |
| Slack | 0.0773771 | 0.0706979 | -0.6044181 | 0.1831596 | -0.0687971 | 0.1280452 | -0.0384856 | -0.0329381 | 0.0051310 | 0.3686470 | 0.6561435 |
# Percent Variance Explained
# Component variances are the squared singular values (sdev^2); normalize them
# to get the proportion of total variance per PC. Use <- for assignment.
pr.var <- pca_mydata$sdev^2
pve <- pr.var / sum(pr.var)
pve
## [1] 0.32005368 0.20384070 0.11756412 0.08471263 0.07862810 0.05886057
## [7] 0.04138278 0.03665455 0.03382128 0.01231714 0.01216444
PC1 explains 32% of the variance in the data and PC2 explains 20%.
par(mfrow=c(1,2))
plot(pve, xlab="Principal Component", ylab="Percent Variance Explained", ylim=c(0,1), type='b')
plot(cumsum(pve), xlab="Principal Component", ylab="Accumulated Percent Variance Explained",
     ylim=c(0,1), type='b', col='blue')
# abline() is a separate base-graphics call. The original "plot(...) +
# abline(...)" summed the two (invisible) return values, which printed the
# spurious "integer(0)" seen below.
abline(h=0.75, col='red')
## integer(0)
We can see in the figure that with 5 principal components we can describe more than 75% of the variance in the data, which is used as a rule of thumb in practice.
set.seed(42)
# Repeat the PCA, this time re-standardizing the sampled predictors.
# Spell out TRUE rather than the reassignable shorthand T.
pca_mydata <- prcomp(mydata[,-1], scale. = TRUE)
par(mfrow=c(1,1))
# Scores on the first two PCs, colored by class as before.
plot(pca_mydata$x[,1:2], col=Cols(labels))
title(main = "PCA after scaling the datasample, 20k datapoints")
We can see that scaling makes a big difference.
# Biplot of the scaled PCA; "." glyphs keep 20k observations readable.
biplot(pca_mydata, xlabs=rep(".", nrow(mydata)))
title(main = "biplot of PCA after scaling the datasample")
In the scaled case the principal components consists of a different combination of the original variables, which we can also see in the rotation matrix:
# Loading matrix of the scaled PCA, for comparison with the unscaled one.
pca_mydata$rotation %>% kable(row.names = T)
| PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | PC11 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Age | 0.1338299 | -0.2799918 | 0.1040284 | 0.5863831 | 0.2563284 | 0.6616306 | -0.1727690 | 0.1117026 | -0.0498581 | 0.0170975 | 0.0215189 |
| RUUL | -0.1261597 | 0.4210614 | -0.2631900 | -0.1209986 | -0.3989183 | 0.2999257 | -0.6543846 | 0.2040655 | -0.0377837 | 0.0131549 | -0.0482449 |
| DebtRatio | 0.4182097 | 0.3320304 | 0.2504661 | -0.0328654 | -0.0819929 | 0.1929248 | 0.1071547 | -0.3473466 | 0.0117690 | -0.3056489 | -0.6175830 |
| OpenCredit | 0.4340212 | -0.0857793 | 0.0311074 | 0.1453512 | 0.2596008 | -0.4756121 | -0.6010691 | -0.0908545 | 0.3405589 | 0.0550962 | -0.0043384 |
| REloans | 0.4909171 | -0.0079647 | -0.0016664 | -0.0747607 | -0.0834084 | -0.0959248 | 0.1922375 | 0.7883077 | -0.1445887 | 0.1602319 | -0.1795049 |
| Dep | 0.0823588 | 0.0850987 | -0.2545554 | -0.5854808 | 0.6977889 | 0.2962234 | -0.0381815 | 0.0020099 | -0.0031665 | 0.0474922 | -0.0019825 |
| no3059 | 0.0757403 | 0.3543293 | -0.3957709 | 0.3869452 | 0.2362349 | -0.2554952 | 0.0335292 | -0.1353367 | -0.6491211 | -0.0043596 | 0.0212324 |
| LateDays | -0.0417371 | 0.3925937 | -0.4123410 | 0.3224337 | 0.0833002 | 0.0376116 | 0.3313530 | 0.1198954 | 0.6611620 | -0.0189046 | 0.0111243 |
| logIncome | 0.2389017 | -0.3379409 | -0.4755298 | -0.0535116 | -0.2891626 | 0.1175131 | 0.0828085 | -0.3544060 | 0.0175962 | 0.5725837 | -0.2041637 |
| Debt | 0.5372637 | 0.0662357 | -0.0713837 | -0.1163259 | -0.2377515 | 0.1629084 | 0.1017526 | -0.1622569 | -0.0137154 | -0.2728545 | 0.7028156 |
| Slack | -0.0437274 | -0.4702489 | -0.4832422 | -0.0290669 | -0.0655098 | -0.0697987 | -0.0512108 | 0.1045021 | -0.0077873 | -0.6873645 | -0.2175976 |
This time LateDays, no3059 and RUUL are mapped into PC2.
# Percent Variance Explained
# Recompute the proportion of variance explained for the scaled PCA.
# Use <- for assignment.
pr.var <- pca_mydata$sdev^2
pve <- pr.var / sum(pr.var)
pve
## [1] 0.24802228 0.20157463 0.15011676 0.10426470 0.07586700 0.05980580
## [7] 0.04673449 0.04179853 0.03818827 0.01864291 0.01498462
This time PC1 explains 25% of the variance in the data and PC2 explains 20%.
par(mfrow=c(1,2))
plot(pve, xlab="Principal Component", ylab="Percent Variance Explained", ylim=c(0,1), type='b')
plot(cumsum(pve), xlab="Principal Component", ylab="Accumulated Percent Variance Explained",
     ylim=c(0,1), type='b', col='blue')
# Draw the 75% reference line with its own call; "+ abline(...)" was the
# source of the spurious "integer(0)" output below.
abline(h=0.75, col='red')
## integer(0)
Again we see that we need 5 principal components to describe more than 75% of the variance.
# Response as a factor for grouping, plus matching point colors
# (green = 0, red = 1, consistent with Cols()). Use <- for assignment.
gr <- factor(mydata[,1])
colors <- mapvalues(gr, from = c(0, 1), to = c("green", "red"))
# Class counts confirm the sample is balanced.
summary(gr)
## 0 1
## 10026 10026
# Interactive 3D scatter of the first three PCs (opens an rgl device); the
# printed numbers are the variance proportions pca3d reports for the axes.
pca3d(pca_mydata, group = gr, col = colors)
## [1] 0.2727411 0.1550222 0.1535694
## Creating new device
# Snapshot of the current rgl view, written to the working directory.
snapshotPCA3d(file="PCA3D_20k.png")
I now take a smaller sample, which is important for using t-SNE later on.
For this sample I take 500 datapoints with SD=0 and 500 datapoints with SD=1.
set.seed(42)
# Balanced 1,000-point subsample (500 per class) so t-SNE stays tractable.
SD0_sample_500 <- sample_n(SD0, 500)
SD1_sample_500 <- sample_n(SD1, 500)
mydata_balanced_1k <- rbind(SD0_sample_500, SD1_sample_500)
# Shuffle the rows; derive the size from the data rather than hard-coding 1000
# (identical draw here, since nrow() is exactly 1000 under the same seed).
mydata <- mydata_balanced_1k[sample(nrow(mydata_balanced_1k)), ]
summary(mydata)
## SD Age RUUL DebtRatio
## Min. :0.0 Min. :-2.0510 Min. :-0.9038 Min. :-1.3165
## 1st Qu.:0.0 1st Qu.:-0.9001 1st Qu.:-0.7070 1st Qu.:-0.5593
## Median :0.5 Median :-0.2231 Median : 0.3354 Median : 0.0000
## Mean :0.5 Mean :-0.1811 Mean : 0.4848 Mean : 0.1580
## 3rd Qu.:1.0 3rd Qu.: 0.3862 3rd Qu.: 1.6450 3rd Qu.: 0.5456
## Max. :1.0 Max. : 2.8911 Max. : 2.8884 Max. : 6.0776
## OpenCredit REloans Dep no3059
## Min. :-1.64260 Min. :-0.91196 Min. :-0.6667 Min. :-0.3519
## 1st Qu.:-0.67097 1st Qu.:-0.91196 1st Qu.:-0.6667 1st Qu.:-0.3519
## Median :-0.08798 Median :-0.01585 Median :-0.6667 Median :-0.3519
## Mean : 0.01015 Mean : 0.01193 Mean : 0.1453 Mean : 0.5474
## 3rd Qu.: 0.49500 3rd Qu.: 0.88026 3rd Qu.: 1.1417 3rd Qu.: 1.0823
## Max. : 7.87944 Max. : 9.84139 Max. : 6.5668 Max. :11.1221
## LateDays logIncome Debt
## Min. :-0.3103 Min. :-10.54352 Min. :-1.2109
## 1st Qu.:-0.3103 1st Qu.: -0.40410 1st Qu.:-0.7249
## Median :-0.3103 Median : 0.21230 Median : 0.0596
## Mean : 0.7860 Mean : -0.02264 Mean : 0.1020
## 3rd Qu.: 1.1315 3rd Qu.: 0.40187 3rd Qu.: 0.4679
## Max. :15.0694 Max. : 1.35199 Max. : 7.5926
## Slack
## Min. :-2.68051
## 1st Qu.:-0.75851
## Median :-0.08469
## Mean :-0.10551
## 3rd Qu.: 0.25036
## Max. : 3.93283
set.seed(42)
# PCA on the 1k subsample, re-standardizing the predictors.
# Spell out TRUE (not T) and use <- for assignment.
pca_mydata <- prcomp(mydata[,-1], scale. = TRUE)
gr <- factor(mydata[,1])
colors <- mapvalues(gr, from = c(0, 1), to = c("green", "red"))
# Class counts confirm the 500/500 balance.
summary(gr)
## 0 1
## 500 500
# 3D PCA plot with per-class 75% confidence ellipsoids; base plane hidden.
pca3d(pca_mydata, group = gr, show.ellipses=TRUE, ellipse.ci=0.75, show.plane=F, col = colors)
## [1] 0.1760447 0.1136925 0.1375280
# Save the rgl view to the working directory.
snapshotPCA3d(file="PCA3D_1k.png")
The TSNE algorithm can be used to map high dimensional data sets onto a 2 or 3 dimensional graph.
I’ll use the reduced dataset, because when the number of observations is too large (more than 10,000), the algorithm takes a lot of time and the visualization doesn’t work.
A feature of t-SNE is a tuneable parameter “perplexity”, which says (loosely) how to balance attention between local and global aspects of your data. The parameter is, in a sense, a guess about the number of close neighbors each point has. I will vary the perplexity parameter, to see what effect it has on my data.
# t-SNE requires distinct rows, so duplicates are removed first.
# NOTE(review): if unique() actually dropped rows, tsne_out$Y would no longer
# align with mydata$SD in the plots below — the log reports 1000 rows, so no
# duplicates were dropped here, but this should be checked on other samples.
mydata_matrix <- as.matrix(unique(mydata[,-1]))
set.seed(42)
tsne_out <- Rtsne(mydata_matrix, dims = 2, perplexity=30, verbose=TRUE, max_iter = 500) # Run TSNE
## Read the 1000 x 11 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
## - point 0 of 1000
## Done in 0.10 seconds (sparsity = 0.128138)!
## Learning embedding...
## Iteration 50: error is 69.757288 (50 iterations in 0.46 seconds)
## Iteration 100: error is 67.910138 (50 iterations in 0.45 seconds)
## Iteration 150: error is 67.901686 (50 iterations in 0.41 seconds)
## Iteration 200: error is 67.901316 (50 iterations in 0.42 seconds)
## Iteration 250: error is 67.901649 (50 iterations in 0.42 seconds)
## Iteration 300: error is 1.462661 (50 iterations in 0.41 seconds)
## Iteration 350: error is 1.288151 (50 iterations in 0.42 seconds)
## Iteration 400: error is 1.242164 (50 iterations in 0.42 seconds)
## Iteration 450: error is 1.220981 (50 iterations in 0.42 seconds)
## Iteration 500: error is 1.211245 (50 iterations in 0.42 seconds)
## Fitting performed in 4.25 seconds.
# Persist the perplexity-30 embedding, then plot it colored by class.
save(tsne_out, file="/Users/Jurjen/OneDrive - U-tad/ReduccionDimensiones/Practica/tsne_out_per30.RData")
plot(tsne_out$Y, col=Cols(mydata$SD))
title(main = "t-SNE, perplexity=30")
# mydata_matrix was already built above and mydata has not changed, so reuse
# it rather than recomputing unique()/as.matrix(). Only the perplexity differs.
set.seed(42)
tsne_out <- Rtsne(mydata_matrix, dims = 2, perplexity=50, verbose=TRUE, max_iter = 500) # Run TSNE
## Read the 1000 x 11 data matrix successfully!
## Using no_dims = 2, perplexity = 50.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
## - point 0 of 1000
## Done in 0.17 seconds (sparsity = 0.213184)!
## Learning embedding...
## Iteration 50: error is 63.671460 (50 iterations in 0.51 seconds)
## Iteration 100: error is 63.154005 (50 iterations in 0.53 seconds)
## Iteration 150: error is 62.757511 (50 iterations in 0.46 seconds)
## Iteration 200: error is 62.757626 (50 iterations in 0.46 seconds)
## Iteration 250: error is 62.757590 (50 iterations in 0.46 seconds)
## Iteration 300: error is 1.274432 (50 iterations in 0.45 seconds)
## Iteration 350: error is 1.146637 (50 iterations in 0.45 seconds)
## Iteration 400: error is 1.109491 (50 iterations in 0.45 seconds)
## Iteration 450: error is 1.097372 (50 iterations in 0.45 seconds)
## Iteration 500: error is 1.092265 (50 iterations in 0.44 seconds)
## Fitting performed in 4.66 seconds.
# Persist the perplexity-50 embedding, then plot it colored by class.
save(tsne_out, file="/Users/Jurjen/OneDrive - U-tad/ReduccionDimensiones/Practica/tsne_out_per50.RData")
plot(tsne_out$Y, col=Cols(mydata$SD))
title(main = "t-SNE, perplexity=50")
# Reuse the existing mydata_matrix (identical recomputation removed).
# Higher perplexity = more weight on global structure.
set.seed(42)
tsne_out <- Rtsne(mydata_matrix, dims = 2, perplexity=100, verbose=TRUE, max_iter = 500) # Run TSNE
## Read the 1000 x 11 data matrix successfully!
## Using no_dims = 2, perplexity = 100.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
## - point 0 of 1000
## Done in 0.36 seconds (sparsity = 0.418520)!
## Learning embedding...
## Iteration 50: error is 55.392553 (50 iterations in 0.61 seconds)
## Iteration 100: error is 55.392553 (50 iterations in 0.58 seconds)
## Iteration 150: error is 55.392553 (50 iterations in 0.57 seconds)
## Iteration 200: error is 55.392553 (50 iterations in 0.60 seconds)
## Iteration 250: error is 55.392553 (50 iterations in 0.62 seconds)
## Iteration 300: error is 0.978117 (50 iterations in 0.62 seconds)
## Iteration 350: error is 0.859854 (50 iterations in 0.55 seconds)
## Iteration 400: error is 0.842660 (50 iterations in 0.53 seconds)
## Iteration 450: error is 0.836733 (50 iterations in 0.54 seconds)
## Iteration 500: error is 0.833816 (50 iterations in 0.54 seconds)
## Fitting performed in 5.77 seconds.
# Persist the 2-D perplexity-100 embedding, then plot it colored by class.
save(tsne_out, file="/Users/Jurjen/OneDrive - U-tad/ReduccionDimensiones/Practica/tsne_out_per100.RData")
plot(tsne_out$Y, col=Cols(mydata$SD))
title(main = "t-SNE, perplexity=100")
When I set perplexity=100 it gives the best separation of the red and green points.
# Reuse mydata_matrix (identical recomputation removed) and embed into three
# dimensions this time, keeping the same perplexity and seed.
set.seed(42)
tsne_out <- Rtsne(mydata_matrix, dims = 3, perplexity=100, verbose=TRUE, max_iter = 500) # Run TSNE
## Read the 1000 x 11 data matrix successfully!
## Using no_dims = 3, perplexity = 100.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
## - point 0 of 1000
## Done in 0.36 seconds (sparsity = 0.418520)!
## Learning embedding...
## Iteration 50: error is 55.392553 (50 iterations in 0.94 seconds)
## Iteration 100: error is 55.392553 (50 iterations in 0.85 seconds)
## Iteration 150: error is 55.392553 (50 iterations in 0.87 seconds)
## Iteration 200: error is 55.392553 (50 iterations in 0.91 seconds)
## Iteration 250: error is 55.392553 (50 iterations in 0.96 seconds)
## Iteration 300: error is 0.816494 (50 iterations in 0.94 seconds)
## Iteration 350: error is 0.703511 (50 iterations in 0.87 seconds)
## Iteration 400: error is 0.683349 (50 iterations in 0.91 seconds)
## Iteration 450: error is 0.672979 (50 iterations in 0.88 seconds)
## Iteration 500: error is 0.668926 (50 iterations in 0.88 seconds)
## Fitting performed in 9.01 seconds.
# Save under a distinct name: the original reused "tsne_out_per100.RData",
# silently overwriting the 2-D perplexity-100 embedding saved earlier.
save(tsne_out, file="/Users/Jurjen/OneDrive - U-tad/ReduccionDimensiones/Practica/tsne_out_per100_dim3.RData")
# 2-D scatter of the first two embedding dimensions.
plot(tsne_out$Y, col=Cols(mydata$SD))
title(main = "t-SNE, perplexity=100, dims=3")
# Static 3-D view of the same embedding.
scatterplot3d(x=tsne_out$Y[,1],y=tsne_out$Y[,2],z=tsne_out$Y[,3], color = Cols(mydata$SD))
title(main = "t-SNE, perplexity=100, dims=3")
# Interactive rgl version of the 3-D embedding (spheres of radius 0.5).
plot3d(x=tsne_out$Y[,1],y=tsne_out$Y[,2],z=tsne_out$Y[,3],
col = Cols(mydata$SD),
type="s",radius=0.5)
This visualization with t-SNE shows that it should be reasonably possible to separate the data in a higher dimension (in my case 11 dimensions), although a fraction of the points will be misclassified.
From this analysis I would say that it seems like a good idea to keep the 11 variables that I obtained after data preparation and try to use a nonlinear algorithm (like a neural network) to predict the probability that somebody will experience financial distress in the next two years. I have a lot of data to train the network on. Although it would be interesting to compare with the results of linear classification, either with PCA or keeping only the most ‘important’ variables. Because PCA suggests (looking at the 3D graph of 20,000 datapoints) that up to a certain accuracy the data are linearly separable.
The idea of kernelPCA is to first transform the data to another space using a kernel and do PCA in that space.
I will use a Radial Basis Kernel, and look at different values of sigma.
# Kernel PCA with an RBF kernel and a very small bandwidth (sigma = 1e-4);
# extract the first two principal components in feature space.
kpc <- kpca(~., data = mydata[,-1], kernel = "rbfdot", kpar = list(sigma = 0.0001), features = 2)
# rotated() returns the projected data points.
plot(rotated(kpc), col=Cols(mydata$SD))
title(main = "KernelPCA (without scaling the datasample), Radial Basis kernel, sigma=0.0001")
# Same RBF kernel PCA with a 100x larger bandwidth (sigma = 0.01) to compare
# the effect of the kernel width.
kpc <- kpca(~., data = mydata[,-1], kernel = "rbfdot", kpar = list(sigma = 0.01), features = 2)
plot(rotated(kpc), col=Cols(mydata$SD))
title(main = "KernelPCA (without scaling the datasample), Radial Basis kernel, sigma=0.01")
# Three kernel principal components (sigma = 1e-4), viewed interactively.
kpc <- kpca(~., data = mydata[,-1], kernel = "rbfdot", kpar = list(sigma = 0.0001), features = 3)
plot3d(x=rotated(kpc)[,1],y=rotated(kpc)[,2],z=rotated(kpc)[,3],
col = Cols(mydata$SD),
type="s",radius=0.1)
I thought that scaling has to occur after applying the kernel, but I’m not sure if kpca does this automatically. Let’s see what happens when we scale the data first.
# Center and scale the predictors with caret, then re-run kernel PCA on the
# standardized data. Use <- for assignment.
trans <- preProcess(mydata[,-1],
                    method=c("center", "scale"))
# NOTE(review): predict() is applied to the full mydata, including the SD
# column the transform was not fitted on; preProcess should pass unfitted
# columns through untouched — confirm this against the caret docs.
mydata_norm <- predict(trans, mydata)
kpc <- kpca(~., data = mydata_norm[,-1], kernel = "rbfdot", kpar = list(sigma = 0.0001), features = 2)
plot(rotated(kpc), col=Cols(mydata$SD))
title(main = "KernelPCA (after scaling datasample), Radial Basis kernel, sigma=0.0001")
# Same scaled kernel PCA with three features, viewed interactively in rgl.
kpc <- kpca(~., data = mydata_norm[,-1], kernel = "rbfdot", kpar = list(sigma = 0.0001), features = 3)
plot3d(x=rotated(kpc)[,1],y=rotated(kpc)[,2],z=rotated(kpc)[,3],
col = Cols(mydata$SD),
type="s",radius=0.1)
I’m not sure if scaling before kpca makes things better…